from dask_gateway import GatewayCluster, Gateway
import numpy as np
import s3fs
import xarray as xr
import hvplot.xarray
import hvplot as hv
import geoviews as gv
from geoviews import opts
import datashader as dsh
from holoviews.operation.datashader import rasterize
hv.extension('bokeh')
gv.extension('bokeh')
hv.output(size=150)
gv.output(size=300)Visualizing Zarr: Data from the Soil Moisture Active Passive Mission
Open in SMCE DaskHub (requires access)
The SMAP mission is an orbiting observatory that measures the amount of water in the surface soil everywhere on Earth.
This notebook demonstrates 2 strategies for to subselect data from a Zarr dataset in order to visualize using the memory of a notebook.
- Downsample the temporal resolution of the data using
xarray.DataArray.resample - Coarsening the spatial aspect of the data using
xarray.DataArray.coarsen
A strategy for visualizing any large amount of data is Datashader which bins data into a fixed 2-D array. The call to rasterize ensures the use of the datashader library to bin the data.
Compute environment
This notebook was written on the VEDA SMCE DaskHub and as such is designed to be run on a jupyterhub which is associated with an AWS IAM role which has been granted permissions to the VEDA data store via its bucket policy. The instance used provided 16GB of RAM.
Load libraries
Create and Scale a Dask Cluster
We create a dask cluster to speed up reprojecting the data (and other potential computations which could be required and are parallelizable).
gateway = Gateway()
clusters = gateway.list_clusters()
# connect to an existing cluster - this is useful when the kernel shutdown in the middle of an interactive session
if clusters:
cluster = gateway.connect(clusters[0].name)
else:
cluster = GatewayCluster(shutdown_on_close=True)
cluster.scale(16)
client = cluster.get_client()
print(cluster.dashboard_link)/services/dask-gateway/clusters/daskhub.836736d376e74ffe9386bf84683f21b3/status
Open the dataset from S3
s3 = s3fs.S3FileSystem()
root= 'veda-data-store-staging/EIS/zarr/SPL3SMP.zarr'
store = s3fs.S3Map(root=root, s3=s3)
ds = xr.open_zarr(store=store)
ds<xarray.Dataset>
Dimensions: (northing_m: 406, easting_m: 964,
datetime: 1679)
Coordinates:
* datetime (datetime) datetime64[ns] 2018-01-01 ... 2...
* easting_m (easting_m) float64 -1.735e+07 ... 1.735e+07
* northing_m (northing_m) float64 7.297e+06 ... -7.297e+06
Data variables: (12/26)
albedo (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
albedo_pm (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
bulk_density (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
bulk_density_pm (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
clay_fraction (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
clay_fraction_pm (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
... ...
static_water_body_fraction (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
static_water_body_fraction_pm (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
surface_flag (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
surface_flag_pm (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
surface_temperature (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>
surface_temperature_pm (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>Select the variable of interest (soil moisture for this example).
soil_moisture = ds['soil_moisture'].to_dataset(name='soil_moisture')
soil_moisture<xarray.Dataset>
Dimensions: (datetime: 1679, easting_m: 964, northing_m: 406)
Coordinates:
* datetime (datetime) datetime64[ns] 2018-01-01 ... 2022-09-09
* easting_m (easting_m) float64 -1.735e+07 -1.731e+07 ... 1.735e+07
* northing_m (northing_m) float64 7.297e+06 7.26e+06 ... -7.297e+06
Data variables:
soil_moisture (northing_m, easting_m, datetime) float32 dask.array<chunksize=(100, 100, 100), meta=np.ndarray>Strategy 1: Downsample the temporal resolution of the data
To plot one day from every month, resample the data to 1 observation a month.
somo_one_month = soil_moisture.resample(datetime="1M").nearest()Reproject the data for map visualization.
somo_one_month = somo_one_month.rename({'easting_m': 'longitude', 'northing_m': 'latitude'})
somo_one_month = somo_one_month.rio.write_crs("epsg:6933", inplace=True).transpose('datetime', 'latitude', 'longitude')
somo_reprojected = somo_one_month.rio.reproject("EPSG:4326")Create a geoviews dataset and visualize the data on a map.
kdims = ['datetime', 'x', 'y']
vdims = ['soil_moisture']
xr_dataset = gv.Dataset(somo_reprojected, kdims=kdims, vdims=vdims)
images = xr_dataset.to(gv.Image, ['x', 'y'])rasterize(images, precompute=True, aggregator=dsh.mean('soil_moisture')) * gv.feature.coastlineStrategy 2: Coarsen spatial resolution of the data
Below, we coarsen the spatial resolution of the data by a factor of 4 longitude and 2 latitude. These values were chosen because they can be used with the exact boundary argument as the dimensions size is a multiple of these values.
You can also coarsen by datetime, using the same strategy as below but replacing easting_m and northing_m with datetime. If {datetime: n} is the value give to the dim argument, this would create a mean of the soil moisture average for n days.
Once the data has been coarsned, again it is reprojected for map visualization and then visualized using Geoviews.
coarsened = soil_moisture.coarsen(dim={"easting_m": 4, "northing_m": 2}, boundary="exact").mean()
coarsened<xarray.Dataset>
Dimensions: (northing_m: 203, easting_m: 241, datetime: 1679)
Coordinates:
* datetime (datetime) datetime64[ns] 2018-01-01 ... 2022-09-09
* easting_m (easting_m) float64 -1.73e+07 -1.715e+07 ... 1.73e+07
* northing_m (northing_m) float64 7.279e+06 7.206e+06 ... -7.279e+06
Data variables:
soil_moisture (northing_m, easting_m, datetime) float32 dask.array<chunksize=(50, 25, 100), meta=np.ndarray>coarsened = coarsened.rename({'easting_m': 'longitude', 'northing_m': 'latitude'})
coarsened = coarsened.rio.write_crs("epsg:6933", inplace=True).transpose('datetime', 'latitude', 'longitude')
coarsened_reprojected = coarsened.rio.reproject("EPSG:4326")kdims = ['datetime', 'x', 'y']
vdims = ['soil_moisture']
xr_dataset = gv.Dataset(coarsened_reprojected, kdims=kdims, vdims=vdims)
images = xr_dataset.to(gv.Image, ['x', 'y'])rasterize(images, precompute=True, aggregator=dsh.mean('soil_moisture')) * gv.feature.coastlineCleanup
#client.shutdown()
#cluster.shutdown()